1 Info

Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.

2 Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
library(patchwork)

3 Data: load in and cleaning

Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.

3.1 Normal light

3.1.1 1st classifier/data (2020-2021)

Note: there are only few individuals of the small ciliates because only cells bigger than 1000um2 are tracked at this magnification and most small ciliates (Tetrahymena, Dexiostoma and Loxocephalus) are smaller than that.

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Coleps_irchel/Morph_mvt.RData")
dd16 <- morph_mvt

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Euplotes/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Euplotes_second/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Paramecium_bursaria/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Dexiostoma/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Loxocephallus/Morph_mvt.RData")
dd16 <- rbind(dd16, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/16x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd16 <- rbind(dd16, morph_mvt)

# filtering
dd16 <- dd16 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)

dd16$magnification <- 1.6

# table(dd16$species)

3.1.2 2nd classifier data (2022 Feb) - normal light

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Coleps_irchel/Morph_mvt.RData")
dd16_2022feb <- morph_mvt

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Dexiostoma/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Euplotes/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Loxochephalus//Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Paramecium_bursaria/Morph_mvt.RData")
# morph_mvt %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   geom_vline(xintercept = 3000)
morph_mvt <- morph_mvt %>% dplyr::filter(mean_area>3000)
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Paramecium_caudatum/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Stylonychia2_batch1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch1"
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Stylonychia2_batch2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch2"
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/16x/Tetrahymena/Morph_mvt.RData")
dd16_2022feb <- rbind(dd16_2022feb, morph_mvt)

# filtering

dd16_2022feb <- dd16_2022feb %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_2022feb$magnification <- 1.6

# table(dd16_2022feb$species)

3.1.3 3rd data (20220301) - normal light

load("../../Class_18C_normalLight/3rd_data_20220301/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220301 <- morph_mvt

load("../../Class_18C_normalLight/3rd_data_20220301/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220301 <- rbind(dd16_20220301, morph_mvt)

# filtering

dd16_20220301 <- dd16_20220301 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220301$magnification <- 1.6

# dd16_20220301 %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd16_20220301 <- dd16_20220301 %>%
  dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000),
                !(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1500))

# table(dd16_20220301$species)

3.1.4 4th data (20220316) - normal light

load("../../Class_18C_normalLight/4th_data_20220316/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220316 <- morph_mvt

load("../../Class_18C_normalLight/4th_data_20220316/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220316 <- rbind(dd16_20220316, morph_mvt)

# filtering

dd16_20220316 <- dd16_20220316 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220316$magnification <- 1.6

# dd16_20220316 %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd16_20220316 <- dd16_20220316 %>%
  dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000),
                !(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))

# table(dd16_20220316$species)

3.1.5 5th data (20220406) - normal light

load("../../Class_18C_normalLight/5th_data_20220406/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220406 <- morph_mvt

load("../../Class_18C_normalLight/5th_data_20220406/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220406 <- rbind(dd16_20220406, morph_mvt)

# filtering

dd16_20220406 <- dd16_20220406 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220406$magnification <- 1.6

# dd16_20220406 %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd16_20220406 <- dd16_20220406 %>%
  dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000),
                !(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))

# table(dd16_20220406$species)

3.2 Decreasing light

3.2.1 2nd classifier data (2022 Feb) - 18 perc

load("../../Class_18C_decreasingLight/Light_18perc/data/16x/Euplotes/Morph_mvt.RData")
dd16_2022feb_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_18perc/data/16x/Paramecium_bursaria/Morph_mvt.RData")
a <- morph_mvt %>%
  ggplot(aes(mean_area))+
  geom_histogram()

Pburs_dark_16x <- morph_mvt %>%
  dplyr::select(mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
                sd_major,mean_ar,sd_ar,
                mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
                min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed)

set.seed(23)
Pburs_dark_16x_clust <- kmeans(Pburs_dark_16x, centers = 2, nstart = 25, iter.max = 1000)
Pburs_dark_16x$cluster <- as.factor(Pburs_dark_16x_clust$cluster)
b <- Pburs_dark_16x %>%
  ggplot(aes(mean_area, col=cluster, fill=cluster))+
  geom_histogram(position = "identity", alpha=0.3)

Pburs_dark_16x_clust <- kmeans(Pburs_dark_16x, centers = 3, nstart = 25, iter.max = 1000)
Pburs_dark_16x$cluster <- as.factor(Pburs_dark_16x_clust$cluster)
c <- Pburs_dark_16x %>%
  ggplot(aes(mean_area, col=cluster, fill=cluster))+
  geom_histogram(position = "identity", alpha=0.3)

Pburs_dark_16x_clust <- kmeans(Pburs_dark_16x, centers = 4, nstart = 25, iter.max = 1000)
Pburs_dark_16x$cluster <- as.factor(Pburs_dark_16x_clust$cluster)
d <- Pburs_dark_16x %>%
  ggplot(aes(mean_area, col=cluster, fill=cluster))+
  geom_histogram(position = "identity", alpha=0.3)

morph_mvt <- morph_mvt %>%
  dplyr::mutate(cluster=as.factor(Pburs_dark_16x_clust$cluster)) %>%
  dplyr::filter(cluster !="2") %>%
  dplyr::select(-cluster)

e <- morph_mvt %>%
  ggplot(aes(mean_area))+
  geom_histogram()

# a + b + c + d + e + plot_annotation(title = "Cleaning of P.burs dark 16x")

dd16_2022feb_decrease <- rbind(dd16_2022feb_decrease, morph_mvt)

# filtering

dd16_2022feb_decrease <- dd16_2022feb_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_2022feb_decrease$magnification <- 1.6

# table(dd16_2022feb_decrease$species)

3.2.2 3rd data (20220301) - 10 perc

load("../../Class_18C_decreasingLight/Light_10perc/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220301_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_10perc/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220301_decrease <- rbind(dd16_20220301_decrease, morph_mvt)

# filtering

dd16_20220301_decrease <- dd16_20220301_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220301_decrease$magnification <- 1.6

# dd16_20220301_decrease %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd16_20220301_decrease <- dd16_20220301_decrease %>%
  dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2500),
                !(species %in% c("Coleps_irchel") & mean_area<1000),
                !(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1500))

# table(dd16_20220301_decrease$species)

3.2.3 4th data (20220316) - 6 perc

load("../../Class_18C_decreasingLight/Light_6perc/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220316_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_6perc/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220316_decrease <- rbind(dd16_20220316_decrease, morph_mvt)

# filtering

dd16_20220316_decrease <- dd16_20220316_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220316_decrease$magnification <- 1.6

# dd16_20220316_decrease %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd16_20220316_decrease <- dd16_20220316_decrease %>%
  dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000),
                !(species %in% c("Coleps_irchel") & mean_area<1000),
                !(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))

# table(dd16_20220316_decrease$species)

3.2.4 5th data (20220406) - 1 perc

load("../../Class_18C_decreasingLight/Light_1perc/data/16x/large_ciliates/Morph_mvt.RData")
dd16_20220406_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_1perc/data/16x/small_ciliates/Morph_mvt.RData")
dd16_20220406_decrease <- rbind(dd16_20220406_decrease, morph_mvt)

# filtering

dd16_20220406_decrease <- dd16_20220406_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd16_20220406_decrease$magnification <- 1.6

# dd16_20220406_decrease %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd16_20220406_decrease <- dd16_20220406_decrease %>%
  dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000),
                !(species %in% c("Coleps_irchel") & mean_area<1000),
                !(species %in% c("Dexiostoma","Tetrahymena","Loxocephallus") & mean_area>1400))

# table(dd16_20220406_decrease$species)

4 Further data Processing

4.1 Merge data

dd16$species <- ifelse(dd16$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                       "Smaller_ciliates",dd16$species)
dd16_2022feb$species <- ifelse(dd16_2022feb$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                               "Smaller_ciliates",dd16_2022feb$species)
dd16_20220301$species <- ifelse(dd16_20220301$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                                "Smaller_ciliates",dd16_20220301$species)
dd16_20220301_decrease$species <- ifelse(dd16_20220301_decrease$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                                         "Smaller_ciliates",dd16_20220301_decrease$species)
dd16_20220316$species <- ifelse(dd16_20220316$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                                "Smaller_ciliates",dd16_20220316$species)
dd16_20220316_decrease$species <- ifelse(dd16_20220316_decrease$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                                         "Smaller_ciliates",dd16_20220316_decrease$species)
dd16_20220406$species <- ifelse(dd16_20220406$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                                "Smaller_ciliates",dd16_20220406$species)
dd16_20220406_decrease$species <- ifelse(dd16_20220406_decrease$species %in% c("Tetrahymena","Dexiostoma","Loxocephallus"),
                                         "Smaller_ciliates",dd16_20220406_decrease$species)


dd16$data <- "Late 2020"
dd16_2022feb$data <- "20220201"
dd16_2022feb_decrease$data <- "20220201"
dd16_20220301$data <- "20220301"
dd16_20220301_decrease$data <- "20220301"
dd16_20220316$data <- "20220316"
dd16_20220316_decrease$data <- "20220316"
dd16_20220406$data <- "20220406"
dd16_20220406_decrease$data <- "20220406"

dd_30perc <- rbind(dd16,dd16_2022feb,
                   dd16_20220301 %>% dplyr::select(-Light_dark),
                   dd16_20220316 %>% dplyr::select(-Light_dark),
                   dd16_20220406 %>% dplyr::select(-Light_dark))
dd_18perc <- dd16_2022feb_decrease
dd_10perc <- dd16_20220301_decrease %>%
  dplyr::select(-Light_dark)
dd_6perc <- dd16_20220316_decrease %>%
  dplyr::select(-Light_dark)
dd_1perc <- dd16_20220406_decrease %>%
  dplyr::select(-Light_dark)

dd_30perc$light <- "30%"
dd_18perc$light <- "18%"
dd_10perc$light <- "10%"
dd_6perc$light <- "6%"
dd_1perc$light <- "1%"

dd <- rbind(dd_30perc,dd_18perc,dd_10perc,dd_6perc,dd_1perc)
dd$species <- ifelse(dd$species == "Stylonychia_2", "Stylonychia2", dd$species)
dd$species <- ifelse(dd$species %in% c("Stylonychia2_batch1","Stylonychia2_batch2"), "Stylonychia2", dd$species)
# table(dd$species, dd$data, dd$magnification, dd$light)

4.2 Filtering out outliers

If an individual is an outlier in more than 3 traits it gets excluded (about 7% gets excluded). Outlier based on boxplot definition.

dd$id <- 1:nrow(dd)
dd <- dd %>% na.omit()
dd_long <- dd %>%
  dplyr::select(species, mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
                sd_major,mean_ar,sd_ar,
                mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
                min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed, id,
                data, light, magnification) %>%
  pivot_longer(cols=-c(id,species,data,light,magnification), names_to="variable") %>%
  dplyr::group_by(variable,species,data,light,magnification) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))

outliers <- dd_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id, data, light, magnification) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>3)

# table(outliers$species)
# nrow(outliers)/nrow(dd)

dd_filtered <- dd %>%
  dplyr::filter(!is.element(id,outliers$id))

dd$removed_outliers <- F
dd_filtered$removed_outliers <- T

dd_comparison <- rbind(dd,dd_filtered)

dd <- dd_filtered 

# dd_comparison %>%
#   dplyr::filter(magnification==1.6) %>%
#   ggplot(aes((mean_area), col=removed_outliers))+
#   geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
#   # geom_boxplot(outlier.alpha = 0.3) +
#   theme_bw() +
#   facet_wrap(species~interaction(light,data), scales = "free")
#  
# dd_filtered %>%
#   ggplot(aes(log10(mean_area)))+
#   geom_density(aes(col=species))

# table(dd$species, dd$data, dd$magnification, dd$light)

4.3 Sub-sampling to create training data

We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.

dd <- dd %>%
  mutate(data.light = interaction(data,light, drop = T),
         data.light.species=interaction(data,light,species, drop = T))

print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##                
##                 Coleps_irchel Colpidium Euplotes Paramecium_bursaria
##   20220406.1%              43       189       69                 332
##   20220301.10%           1421       462       68                 140
##   20220201.18%              0         0     1404                1464
##   20220201.30%           2038       559      476                 960
##   20220301.30%           1072       402      106                 141
##   20220316.30%            207       436      286                1461
##   20220406.30%            133       199      236                 763
##   Late 2020.30%          1120       296     1879                2307
##   20220316.6%             329       468      697                1375
##                
##                 Paramecium_caudatum Smaller_ciliates Stylonychia1 Stylonychia2
##   20220406.1%                   199                2            0          237
##   20220301.10%                  744              352            0          922
##   20220201.18%                    0                0            0            0
##   20220201.30%                 1134               75            0         1149
##   20220301.30%                 1308              494            0         2071
##   20220316.30%                 1563               21            0         1243
##   20220406.30%                  566               19            0         1017
##   Late 2020.30%                2038               30          561          347
##   20220316.6%                  1371               33            0          925
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
##       Coleps_irchel           Colpidium            Euplotes Paramecium_bursaria 
##                6363                3011                5221                8943 
## Paramecium_caudatum    Smaller_ciliates        Stylonychia1        Stylonychia2 
##                8923                1026                 561                7911

Of course, Stylo1 is the one we the least individuals, because it went extinct and we do not have it anymore. As it went extinct, it shouldn’t be the limiting factor, I think. For now at least I’m taking 2 * Stylo1_sum as the number of individuals per species to be included (if a species/class has less than that all individuals are included).

Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.

n <- min(colsums)*2
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=1122"
num_ind_per_class <- dd %>% group_by(species) %>% 
  summarize(num_class = length(unique(data.light.species)),
            num_ind_per_class = floor(n/num_class)) %>%
  select(species,num_ind_per_class)

num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species

set.seed(53)

split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
  species <- unique(x$species)
  x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)

print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##                
##                 Coleps_irchel Colpidium Euplotes Paramecium_bursaria
##   20220406.1%              43       140       69                 124
##   20220301.10%            140       140       68                 124
##   20220201.18%              0         0      124                 124
##   20220201.30%            140       140      124                 124
##   20220301.30%            140       140      106                 124
##   20220316.30%            140       140      124                 124
##   20220406.30%            133       140      124                 124
##   Late 2020.30%           140       140      124                 124
##   20220316.6%             140       140      124                 124
##                
##                 Paramecium_caudatum Smaller_ciliates Stylonychia1 Stylonychia2
##   20220406.1%                   140                2            0          140
##   20220301.10%                  140              140            0          140
##   20220201.18%                    0                0            0            0
##   20220201.30%                  140               75            0          140
##   20220301.30%                  140              140            0          140
##   20220316.30%                  140               21            0          140
##   20220406.30%                  140               19            0          140
##   Late 2020.30%                 140               30          561          140
##   20220316.6%                   140               33            0          140
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
##       Coleps_irchel           Colpidium            Euplotes Paramecium_bursaria 
##                1016                1120                 987                1116 
## Paramecium_caudatum    Smaller_ciliates        Stylonychia1        Stylonychia2 
##                1120                 460                 561                1120
trainingdata <- trainingdata[complete.cases(trainingdata),]

# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
                                 p = 0.75, # percentage of data as training
                                 list = FALSE)

testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]

5 Loading in species compositions

species <- unique(dd$species)
compositions <- read_csv("../../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions$Smaller_ciliates <- with(compositions, 
                                      ifelse(Tetrahymena == 1| Dexiostoma == 1| Loxocephallus == 1, 1, 0))

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name
## filterting out Stylo1
compositions_list <- lapply(compositions_list, function(x){
  x[x!="Stylonychia1"]
})

6 Classifiers

6.1 Classifier plotting functions

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}

6.2 Training of classifiers

formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

set.seed(7456)
classifiers_18c_16x <- list()
classifiers_18c_16x_plots <- list()
classifiers_18c_16x_assess_plots <- list()

for(i in 1:length(compositions_list)){
  sub_trainingdata <- trainingdata %>%
    filter(species %in% c(compositions_list[[i]]))
  sub_testdata <- testdata %>%
    filter(species %in% c(compositions_list[[i]]))

  sub_trainingdata$species <- factor(sub_trainingdata$species)
  sub_testdata$species <- factor(sub_testdata$species)

  # split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
  # subsamples <- lapply(split_up, function(x) {
  #   x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
  # sub_trainingdata <- do.call("rbind", subsamples)

  # A stratified random split of the data
  # idx_train <- createDataPartition(sub_trainingdata$species,
  #                                  p = 0.7, # percentage of data as training
  #                                  list = FALSE)
  # 
  # sub_testdata <- sub_trainingdata[-idx_train,]
  # sub_trainingdata <- sub_trainingdata[idx_train,]
  
  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c_16x[[i]] <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  classifiers_18c_16x_plots[[i]] <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  if(length(unique(sub_testdata$species))==2){
    cf2 <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                                  sub_testdata$species, positive=as.character(unique(sub_testdata$species)[2]))
    cf3 <- rbind(cf$byClass,cf2$byClass)
    rownames(cf3) <- unique(sub_testdata$species)
    cf$byClass <- cf3
  }
  
  classifiers_18c_16x_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}

names(classifiers_18c_16x_plots) <- names(classifiers_18c_16x) <- 
  names(classifiers_18c_16x_assess_plots) <- comp_name

6.3 Assessing of classifiers

There are mainly 4 measures:

  • Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)

  • Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)

  • Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)

  • Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)

PPV is the most important for us!

library(patchwork)
for(i in 1:length(classifiers_18c_16x_plots)){
  print(classifiers_18c_16x_plots[[i]] + classifiers_18c_16x_assess_plots[[i]] + 
    plot_layout(widths = c(4,2)))
}

7 Save classifiers

saveRDS(classifiers_18c_16x, "svm_video_classifiers_18c_16x_20220316_MergedData.rds")
# cl <- readRDS("classifiers_18c_16x.rds")